{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from utils import tokenize, load_curpus\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache /var/folders/dy/xjy0y7v97js5x1bghby2fnkm0000gn/T/jieba.cache\n",
      "Loading model cost 1.098 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "train_data = load_curpus(\"weibo2018/train.txt\")\n",
    "test_data = load_curpus(\"weibo2018/test.txt\")\n",
    "train_df = pd.DataFrame(train_data, columns=[\"content\", \"sentiment\"])\n",
    "test_df = pd.DataFrame(test_data, columns=[\"content\", \"sentiment\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "加载停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "stopwords = []\n",
    "with open(\"stopwords.txt\", \"r\", encoding=\"utf8\") as f:\n",
    "    for w in f:\n",
    "        stopwords.append(w.strip())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TfIdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/dengxiuqi/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['元', '吨', '数', '末'] not in stop_words.\n",
      "  'stop_words.' % sorted(inconsistent))\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "data_str = [\" \".join(content) for content, sentiment in train_data] + \\\n",
    "            [\" \".join(content) for content, sentiment in test_data]\n",
    "tfidf = TfidfVectorizer(token_pattern='\\[?\\w+\\]?', stop_words=stopwords)\n",
    "tfidf_fit = tfidf.fit_transform(data_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "加载之前训练好的FastText模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from gensim.models import FastText\n",
    "model = FastText.load(\"model/model_100.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "最多只保留Tf-Idf最高的前多少个词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "key_words = 30"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 用每个词的Tfidf作为权重, 对FastText词向量进行加权, 得到表征每个句子的向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/dengxiuqi/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n",
      "  import sys\n",
      "/Users/dengxiuqi/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "X_train, y_train = [], []\n",
    "for content, sentiment in train_data:\n",
    "    X, y = [], sentiment\n",
    "    X_tfidf = tfidf.transform([\" \".join(content)]).toarray()\n",
    "    keywords_index = np.argsort(-X_tfidf)[0, :key_words]\n",
    "    for w in content:\n",
    "        if w in model and w in tfidf.vocabulary_ and tfidf.vocabulary_[w] in keywords_index:\n",
    "            X.append(np.expand_dims(model[w], 0) * X_tfidf[0, tfidf.vocabulary_[w]])\n",
    "    if X:\n",
    "        X = np.concatenate(X)\n",
    "        X = np.mean(X, axis=0)\n",
    "        X_train.append(X)\n",
    "        y_train.append(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/dengxiuqi/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n",
      "  import sys\n",
      "/Users/dengxiuqi/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "X_test, y_test = [], []\n",
    "for content, sentiment in test_data:\n",
    "    X, y = [], sentiment\n",
    "    X_tfidf = tfidf.transform([\" \".join(content)]).toarray()\n",
    "    keywords_index = np.argsort(-X_tfidf)[0, :key_words]\n",
    "    for w in content:\n",
    "        if w in model and w in tfidf.vocabulary_ and tfidf.vocabulary_[w] in keywords_index:\n",
    "            X.append(np.expand_dims(model[w], 0) * X_tfidf[0, tfidf.vocabulary_[w]])\n",
    "    if X:\n",
    "        X = np.concatenate(X)\n",
    "        X = np.mean(X, axis=0)\n",
    "        X_test.append(X)\n",
    "        y_test.append(y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/dengxiuqi/opt/anaconda3/lib/python3.7/site-packages/sklearn/svm/base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
      "  \"avoid this warning.\", FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "SVC(C=1, cache_size=200, class_weight={0: 1.0, 1: 0.95}, coef0=0.0,\n",
       "    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',\n",
       "    kernel='rbf', max_iter=-1, probability=False, random_state=None,\n",
       "    shrinking=True, tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import svm\n",
    "clf = svm.SVC(C=1, class_weight={1: .95, 0: 1.})\n",
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "result = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.70      0.75      0.72       155\n",
      "           1       0.89      0.85      0.87       344\n",
      "\n",
      "    accuracy                           0.82       499\n",
      "   macro avg       0.79      0.80      0.80       499\n",
      "weighted avg       0.83      0.82      0.82       499\n",
      "\n",
      "准确率: 0.8216432865731463\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "print(metrics.classification_report(y_test, result))\n",
    "print(\"准确率:\", metrics.accuracy_score(y_test, result))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
